/*******************************************************************************

This code file cleans Census area characteristics data.

*******************************************************************************/

*** Session setup

	* Start from an empty dataset and switch off output paging
	clear
	set more off

	* Root folder that holds the raw/ and clean/ data directories
	global data "~/Dropbox (MIT)/Research/NYC421a/data"

	* Make the data root the working directory
	cd "${data}"
	
*** Census Block variables

	* Load the block-level characteristics extract
	import delimited "$data/raw/CensusAreaCharacteristics/final/block_characteristics.csv", encoding(ISO-8859-1) clear

	* Render the numeric block identifier as a plain digit string so it can be sliced
	format cb2010 %18.0f
	tostring cb2010, generate(cb2010_str) format(%18.0f)

	* Slice the identifier by character position:
	*   1-5   -> cfips   (compared against five-digit county codes below)
	*   6-11  -> ct2010
	*   12    -> cbg2010
	*   12-15 -> cb2010_ (four-digit code whose first digit is cbg2010)
	generate cfips   = substr(cb2010_str, 1, 5)
	generate ct2010  = substr(cb2010_str, 6, 6)
	generate cbg2010 = substr(cb2010_str, 12, 1)
	generate cb2010_ = substr(cb2010_str, 12, 4)

	* Convert the identifier pieces and the renter-unit count to numeric;
	* pop_block is converted with force, so non-numeric entries become missing
	destring cfips ct2010 cbg2010 cb2010_ ct_occ_renter_units, replace
	destring pop_block, replace force

	* Code borough 1-5 from the county code, in the order listed; any other
	* county code leaves borough missing
	generate borough = .
	local b = 0
	foreach county in 36061 36005 36047 36081 36085 {
		local ++b
		replace borough = `b' if cfips == `county'
	}
	
	* Where the four race shares sum to zero, set the shares and the
	* population density to missing (rowtotal counts missing shares as zero)
	tempvar share_total
	egen `share_total' = rowtotal(sh_white sh_black sh_hispanic sh_asian)
	local blank_if_zero sh_white sh_black sh_hispanic sh_asian popdens
	foreach x of local blank_if_zero {
		replace `x' = . if `share_total' == 0
	}
	drop `share_total'

	* A recorded median age of zero is treated as missing
	replace medage = . if medage == 0

	* With a zero renter-unit count, the renter share is set to missing,
	* and a zero vacancy share is set to missing as well
	replace sh_renter = . if ct_occ_renter_units == 0
	replace sh_vacant = . if sh_vacant == 0 & ct_occ_renter_units == 0

	* Keep population density in logs only
	generate lpopdens = ln(popdens)
	drop popdens
	
	
		* Fill missing renter shares with the mean of their borough x ct2010
		* cell: the pop_block-weighted mean when it exists, otherwise the
		* unweighted mean. (gegen is provided by the gtools package.)
		bysort borough ct2010: gegen sh_renter_1 = mean(sh_renter) [aw=pop_block]
		bysort borough ct2010: gegen sh_renter_2 = mean(sh_renter)
		replace sh_renter = cond(!missing(sh_renter_1), sh_renter_1, sh_renter_2) if missing(sh_renter)
		drop sh_renter_*

	* Discard the helper identifier columns
	drop cb2010_str cb2010 cfips

	* Keep rows whose four-digit code is 1000 or above (rows with a missing
	* code also pass this filter)
	keep if cb2010_ >= 1000

	* The four-digit code takes over the cb2010 name
	rename cb2010_ cb2010

	order borough ct2010 cbg2010 cb2010

	* Hold the block-level data in a temporary file for the merge below
	tempfile cb
	save "`cb'", replace

*** Census Block Group variables

	* Load the block-group-level characteristics extract
	import delimited "$data/raw/CensusAreaCharacteristics/final/blockgroup_characteristics.csv", encoding(ISO-8859-1) clear

	* Render the numeric block-group identifier as a plain digit string
	format blockgroup %18.0f
	tostring blockgroup, generate(blockgroup_str) format(%18.0f)

	* Slice the identifier by character position:
	*   1-5  -> cfips   (compared against five-digit county codes below)
	*   6-11 -> ct2010
	*   12   -> cbg2010
	generate cfips   = substr(blockgroup_str, 1, 5)
	generate ct2010  = substr(blockgroup_str, 6, 6)
	generate cbg2010 = substr(blockgroup_str, 12, 1)

	destring cfips ct2010 cbg2010, replace

	* Code borough 1-5 from the county code, in the order listed; any other
	* county code leaves borough missing
	generate borough = .
	local b = 0
	foreach county in 36061 36005 36047 36081 36085 {
		local ++b
		replace borough = `b' if cfips == `county'
	}

	* The raw identifier columns are no longer needed
	drop blockgroup_str blockgroup cfips
	* drop if cbg2010 == 0
	
	* Median household income: convert to numeric and apply the 250,000 top code.
	* Only text that actually contains the top-code amount ("250000" or
	* "250,000", with any surrounding characters) is set to 250000. Any other
	* non-numeric text stays missing rather than being recorded as top-coded
	* income, and the number of such rows is reported so it can be inspected.
	* NOTE(review): this assumes the top-code marker in the raw file contains
	* 250000 or 250,000 - confirm against the raw CSV.
	capture confirm string variable medhhinc
	if _rc == 0 {
		destring medhhinc, gen(medhhinc_) force
		replace medhhinc_ = 250000 if missing(medhhinc_) & regexm(medhhinc, "250,?000")
		quietly count if missing(medhhinc_) & !missing(medhhinc)
		if r(N) > 0 {
			display as text "note: " r(N) " non-numeric medhhinc value(s) left missing (not top-coded)"
		}
		drop medhhinc
		rename medhhinc_ medhhinc
	}
	* If medhhinc was imported as numeric there is nothing to parse; use it as is

	* Keep income and rents in logs only (ln of a non-positive value is missing)
	gen lmedhhinc = ln(medhhinc)
	drop medhhinc
	
	gen lmedgrossrent = ln(medgrossrent)
	drop medgrossrent
	
	gen lmeangrossrent = ln(meangrossrent)
	drop meangrossrent
	
	order borough ct2010 cbg2010
	
*** Merge together

	* The block-group rows in memory are the "1" side; each can match many
	* rows of the block-level temporary file
	merge 1:m borough ct2010 cbg2010 using "`cb'", nogenerate

	* Keep only rows that carry a block code; this removes block-group rows
	* with no matching block
	keep if !missing(cb2010)

*** Save dataset

	* Identifier columns first, then write the cleaned file
	order borough ct2010 cbg2010 cb2010
	save "$data/clean/census_area_characteristics.dta", replace
